library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the raw fertility-rate table.
# NOTE(review): this is an absolute, machine-specific path; the script will
# only run where this file exists.
indata <- read.csv(
  file = "/Users/eric/Desktop/rates/FertilityRates.csv"
)
# Verify the import: number of rows and columns
dim(indata)
## [1] 219 56
# Preview the first six rows of the raw import
head(indata, n = 6L)
## Country.Name Country.Code Indicator.Name
## 1 Aruba ABW Fertility rate, total (births per woman)
## 2 Andorra AND Fertility rate, total (births per woman)
## 3 Afghanistan AFG Fertility rate, total (births per woman)
## 4 Angola AGO Fertility rate, total (births per woman)
## 5 Albania ALB Fertility rate, total (births per woman)
## 6 United Arab Emirates ARE Fertility rate, total (births per woman)
## Indicator.Code X1960 X1961 X1962 X1963 X1964 X1965 X1966 X1967 X1968 X1969
## 1 SP.DYN.TFRT.IN 4.820 4.655 4.471 4.271 4.059 3.842 3.625 3.417 3.226 3.054
## 2 SP.DYN.TFRT.IN NA NA NA NA NA NA NA NA NA NA
## 3 SP.DYN.TFRT.IN 7.671 7.671 7.671 7.671 7.671 7.671 7.671 7.671 7.671 7.671
## 4 SP.DYN.TFRT.IN 7.316 7.354 7.385 7.410 7.425 7.430 7.422 7.403 7.375 7.339
## 5 SP.DYN.TFRT.IN 6.186 6.076 5.956 5.833 5.711 5.594 5.483 5.376 5.268 5.160
## 6 SP.DYN.TFRT.IN 6.928 6.910 6.893 6.877 6.861 6.841 6.816 6.783 6.738 6.679
## X1970 X1971 X1972 X1973 X1974 X1975 X1976 X1977 X1978 X1979 X1980 X1981 X1982
## 1 2.908 2.788 2.691 2.613 2.552 2.506 2.472 2.446 2.425 2.408 2.392 2.377 2.364
## 2 NA NA NA NA NA NA NA NA NA NA NA NA NA
## 3 7.671 7.671 7.671 7.671 7.671 7.671 7.670 7.670 7.670 7.669 7.669 7.670 7.671
## 4 7.301 7.264 7.232 7.208 7.192 7.185 7.186 7.189 7.194 7.197 7.200 7.201 7.203
## 5 5.050 4.933 4.809 4.677 4.538 4.393 4.244 4.094 3.947 3.807 3.678 3.562 3.460
## 6 6.605 6.512 6.402 6.279 6.146 6.009 5.873 5.744 5.624 5.517 5.423 5.344 5.274
## X1983 X1984 X1985 X1986 X1987 X1988 X1989 X1990 X1991 X1992 X1993 X1994 X1995
## 1 2.353 2.342 2.332 2.320 2.307 2.291 2.272 2.249 2.221 2.187 2.149 2.108 2.064
## 2 NA NA NA NA NA NA NA NA NA NA NA NA NA
## 3 7.673 7.676 7.679 7.681 7.682 7.682 7.682 7.687 7.700 7.725 7.758 7.796 7.832
## 4 7.205 7.207 7.208 7.206 7.202 7.194 7.182 7.165 7.143 7.116 7.087 7.054 7.019
## 5 3.372 3.297 3.233 3.177 3.126 3.075 3.023 2.970 2.917 2.867 2.819 2.772 2.723
## 6 5.209 5.141 5.065 4.973 4.860 4.724 4.566 4.388 4.193 3.989 3.784 3.583 3.393
## X1996 X1997 X1998 X1999 X2000 X2001 X2002 X2003 X2004 X2005 X2006 X2007 X2008
## 1 2.021 1.979 1.940 1.905 1.874 1.848 1.825 1.805 1.786 1.769 1.754 1.739 1.726
## 2 NA NA NA NA NA NA NA NA NA NA 1.240 1.180 1.250
## 3 7.859 7.869 7.854 7.809 7.733 7.623 7.484 7.321 7.136 6.930 6.702 6.456 6.196
## 4 6.984 6.949 6.913 6.878 6.844 6.811 6.778 6.743 6.704 6.657 6.598 6.523 6.434
## 5 2.670 2.611 2.543 2.467 2.383 2.291 2.195 2.097 2.004 1.919 1.849 1.796 1.761
## 6 3.215 3.052 2.902 2.766 2.644 2.532 2.428 2.329 2.236 2.149 2.071 2.004 1.948
## X2009 X2010 X2011
## 1 1.713 1.701 1.690
## 2 1.190 1.220 NA
## 3 5.928 5.659 5.395
## 4 6.331 6.218 6.099
## 5 1.744 1.741 1.748
## 6 1.903 1.868 1.841
** The data has 219 observations and 56 variables, but it is not yet in tidy format: each year is stored in its own column. **
# Is it a data frame or a tibble?
# TRUE when "data.frame" is among the object's classes.
inherits(indata, "data.frame")
## [1] TRUE
** It checks whether the input data is a data frame or not. **
# Switch to a tibble and confirm the conversion
indata <- indata %>% as_tibble()
indata %>% is_tibble()
## [1] TRUE
** ‘indata’ is a data frame, so we convert it into a tibble and confirm the conversion with `is_tibble()`. **
# Summary of data: per-column descriptive statistics
indata %>% summary()
## Country.Name Country.Code Indicator.Name Indicator.Code
## Length:219 Length:219 Length:219 Length:219
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## X1960 X1961 X1962 X1963
## Min. :1.940 Min. :1.940 Min. :1.790 Min. :1.810
## 1st Qu.:4.210 1st Qu.:4.027 1st Qu.:4.123 1st Qu.:4.057
## Median :6.179 Median :6.144 Median :6.122 Median :6.104
## Mean :5.512 Mean :5.492 Mean :5.492 Mean :5.488
## 3rd Qu.:6.803 3rd Qu.:6.803 3rd Qu.:6.821 3rd Qu.:6.821
## Max. :8.187 Max. :8.194 Max. :8.197 Max. :8.198
## NA's :25 NA's :24 NA's :25 NA's :26
## X1964 X1965 X1966 X1967
## Min. :1.790 Min. :1.740 Min. :1.580 Min. :1.800
## 1st Qu.:3.950 1st Qu.:3.821 1st Qu.:3.644 1st Qu.:3.549
## Median :6.061 Median :6.079 Median :6.045 Median :5.995
## Mean :5.442 Mean :5.392 Mean :5.338 Mean :5.294
## 3rd Qu.:6.801 3rd Qu.:6.799 3rd Qu.:6.795 3rd Qu.:6.747
## Max. :8.198 Max. :8.198 Max. :8.198 Max. :8.201
## NA's :25 NA's :25 NA's :25 NA's :25
## X1968 X1969 X1970 X1971
## Min. :1.830 Min. :1.851 Min. :1.828 Min. :1.703
## 1st Qu.:3.394 1st Qu.:3.247 1st Qu.:3.093 1st Qu.:2.999
## Median :5.912 Median :5.798 Median :5.745 Median :5.681
## Mean :5.240 Mean :5.187 Mean :5.134 Mean :5.074
## 3rd Qu.:6.738 3rd Qu.:6.687 3rd Qu.:6.693 3rd Qu.:6.665
## Max. :8.207 Max. :8.217 Max. :8.231 Max. :8.252
## NA's :25 NA's :25 NA's :25 NA's :24
## X1972 X1973 X1974 X1975
## Min. :1.593 Min. :1.504 Min. :1.510 Min. :1.450
## 1st Qu.:3.026 1st Qu.:2.943 1st Qu.:2.851 1st Qu.:2.712
## Median :5.521 Median :5.416 Median :5.340 Median :5.234
## Mean :5.022 Mean :4.962 Mean :4.904 Mean :4.838
## 3rd Qu.:6.713 3rd Qu.:6.710 3rd Qu.:6.676 3rd Qu.:6.674
## Max. :8.278 Max. :8.307 Max. :8.339 Max. :8.370
## NA's :23 NA's :25 NA's :25 NA's :25
## X1976 X1977 X1978 X1979
## Min. :1.440 Min. :1.400 Min. :1.380 Min. :1.380
## 1st Qu.:2.591 1st Qu.:2.506 1st Qu.:2.472 1st Qu.:2.453
## Median :5.159 Median :5.093 Median :5.022 Median :4.884
## Mean :4.791 Mean :4.713 Mean :4.657 Mean :4.610
## 3rd Qu.:6.635 3rd Qu.:6.582 3rd Qu.:6.535 3rd Qu.:6.483
## Max. :8.399 Max. :8.474 Max. :8.667 Max. :8.843
## NA's :25 NA's :25 NA's :25 NA's :25
## X1980 X1981 X1982 X1983
## Min. :1.440 Min. :1.430 Min. :1.410 Min. :1.330
## 1st Qu.:2.401 1st Qu.:2.370 1st Qu.:2.390 1st Qu.:2.346
## Median :4.755 Median :4.607 Median :4.505 Median :4.456
## Mean :4.559 Mean :4.495 Mean :4.436 Mean :4.393
## 3rd Qu.:6.439 3rd Qu.:6.364 3rd Qu.:6.309 3rd Qu.:6.272
## Max. :8.993 Max. :9.108 Max. :9.185 Max. :9.223
## NA's :25 NA's :23 NA's :20 NA's :23
## X1984 X1985 X1986 X1987
## Min. :1.290 Min. :1.370 Min. :1.340 Min. :1.280
## 1st Qu.:2.294 1st Qu.:2.301 1st Qu.:2.257 1st Qu.:2.274
## Median :4.359 Median :4.224 Median :4.112 Median :3.985
## Mean :4.340 Mean :4.279 Mean :4.219 Mean :4.149
## 3rd Qu.:6.220 3rd Qu.:6.186 3rd Qu.:6.061 3rd Qu.:5.905
## Max. :9.223 Max. :9.186 Max. :9.119 Max. :9.030
## NA's :23 NA's :23 NA's :23 NA's :19
## X1988 X1989 X1990 X1991
## Min. :1.320 Min. :1.280 Min. :1.260 Min. :1.270
## 1st Qu.:2.232 1st Qu.:2.200 1st Qu.:2.180 1st Qu.:2.121
## Median :3.908 Median :3.752 Median :3.558 Median :3.485
## Mean :4.099 Mean :4.024 Mean :3.956 Mean :3.875
## 3rd Qu.:5.840 3rd Qu.:5.727 3rd Qu.:5.577 3rd Qu.:5.442
## Max. :8.925 Max. :8.805 Max. :8.667 Max. :8.504
## NA's :23 NA's :23 NA's :20 NA's :20
## X1992 X1993 X1994 X1995
## Min. :1.290 Min. :1.250 Min. :1.200 Min. :1.180
## 1st Qu.:2.120 1st Qu.:2.027 1st Qu.:1.960 1st Qu.:1.879
## Median :3.330 Median :3.279 Median :3.147 Median :3.082
## Mean :3.791 Mean :3.729 Mean :3.645 Mean :3.561
## 3rd Qu.:5.259 3rd Qu.:5.191 3rd Qu.:5.056 3rd Qu.:4.956
## Max. :8.311 Max. :8.088 Max. :7.841 Max. :7.832
## NA's :18 NA's :21 NA's :20 NA's :18
## X1996 X1997 X1998 X1999
## Min. :1.150 Min. :1.090 Min. :1.017 Min. :0.982
## 1st Qu.:1.913 1st Qu.:1.854 1st Qu.:1.806 1st Qu.:1.796
## Median :2.989 Median :2.869 Median :2.855 Median :2.804
## Mean :3.517 Mean :3.422 Mean :3.379 Mean :3.337
## 3rd Qu.:4.864 3rd Qu.:4.636 3rd Qu.:4.593 3rd Qu.:4.545
## Max. :7.859 Max. :7.869 Max. :7.854 Max. :7.809
## NA's :21 NA's :17 NA's :20 NA's :19
## X2000 X2001 X2002 X2003
## Min. :0.939 Min. :0.891 Min. :0.856 Min. :0.838
## 1st Qu.:1.778 1st Qu.:1.780 1st Qu.:1.759 1st Qu.:1.770
## Median :2.679 Median :2.614 Median :2.558 Median :2.524
## Mean :3.252 Mean :3.199 Mean :3.134 Mean :3.103
## 3rd Qu.:4.352 3rd Qu.:4.268 3rd Qu.:4.134 3rd Qu.:4.050
## Max. :7.733 Max. :7.704 Max. :7.681 Max. :7.658
## NA's :17 NA's :18 NA's :15 NA's :17
## X2004 X2005 X2006 X2007
## Min. :0.836 Min. :0.849 Min. :0.874 Min. :0.906
## 1st Qu.:1.782 1st Qu.:1.775 1st Qu.:1.800 1st Qu.:1.797
## Median :2.518 Median :2.496 Median :2.425 Median :2.422
## Mean :3.075 Mean :3.038 Mean :2.995 Mean :2.961
## 3rd Qu.:3.987 3rd Qu.:3.980 3rd Qu.:3.911 3rd Qu.:3.818
## Max. :7.636 Max. :7.617 Max. :7.602 Max. :7.593
## NA's :18 NA's :16 NA's :14 NA's :13
## X2008 X2009 X2010 X2011
## Min. :0.939 Min. :0.973 Min. :1.003 Min. :1.031
## 1st Qu.:1.796 1st Qu.:1.800 1st Qu.:1.800 1st Qu.:1.796
## Median :2.384 Median :2.374 Median :2.344 Median :2.334
## Mean :2.935 Mean :2.904 Mean :2.876 Mean :2.854
## 3rd Qu.:3.733 3rd Qu.:3.691 3rd Qu.:3.665 3rd Qu.:3.633
## Max. :7.588 Max. :7.585 Max. :7.584 Max. :7.581
## NA's :14 NA's :14 NA's :15 NA's :17
** Summary of the ‘indata’ dataset. For each year column from 1960 to 2011 it reports the minimum, first quartile, median, mean, third quartile, maximum, and the number of missing values (NA's). This summary will be helpful for graph plotting. **
# Convert Country Name into a factor
indata <- indata %>%
  mutate(Country.Name = as.factor(Country.Name))
# Level counts for the new factor
summary(indata$Country.Name)
## Afghanistan Albania Algeria
## 1 1 1
## American Samoa Andorra Angola
## 1 1 1
## Antigua and Barbuda Argentina Armenia
## 1 1 1
## Aruba Australia Austria
## 1 1 1
## Azerbaijan Bahamas, The Bahrain
## 1 1 1
## Bangladesh Barbados Belarus
## 1 1 1
## Belgium Belize Benin
## 1 1 1
## Bermuda Bhutan Bolivia
## 1 1 1
## Bosnia and Herzegovina Botswana Brazil
## 1 1 1
## Brunei Darussalam Bulgaria Burkina Faso
## 1 1 1
## Burundi Cabo Verde Cambodia
## 1 1 1
## Cameroon Canada Cayman Islands
## 1 1 1
## Central African Republic Chad Channel Islands
## 1 1 1
## Chile China Colombia
## 1 1 1
## Comoros Congo, Dem. Rep. Congo, Rep.
## 1 1 1
## Costa Rica Cote d'Ivoire Croatia
## 1 1 1
## Cuba Curacao Cyprus
## 1 1 1
## Czech Republic Denmark Djibouti
## 1 1 1
## Dominica Dominican Republic Ecuador
## 1 1 1
## Egypt, Arab Rep. El Salvador Equatorial Guinea
## 1 1 1
## Eritrea Estonia Ethiopia
## 1 1 1
## Faeroe Islands Fiji Finland
## 1 1 1
## France French Polynesia Gabon
## 1 1 1
## Gambia, The Georgia Germany
## 1 1 1
## Ghana Greece Greenland
## 1 1 1
## Grenada Guam Guatemala
## 1 1 1
## Guinea Guinea-Bissau Guyana
## 1 1 1
## Haiti Honduras Hong Kong SAR, China
## 1 1 1
## Hungary Iceland India
## 1 1 1
## Indonesia Iran, Islamic Rep. Iraq
## 1 1 1
## Ireland Isle of Man Israel
## 1 1 1
## Italy Jamaica Japan
## 1 1 1
## Jordan Kazakhstan Kenya
## 1 1 1
## (Other)
## 120
** Converts the ‘Country.Name’ variable into a factor (categorical variable) and prints a summary of its levels. **
# Convert Indicator Name into a factor
indata <- indata %>%
  mutate(Indicator.Name = as.factor(Indicator.Name))
# Level counts for the new factor
summary(indata$Indicator.Name)
## Fertility rate, total (births per woman)
## 219
** Converts ‘Indicator.Name’ into a factor (categorical variable) and prints a summary of it; all 219 rows share the single level "Fertility rate, total (births per woman)". **
# Remove extra columns: everything from Country.Code through Indicator.Code
indata_cleaned <- indata %>%
  select(-(Country.Code:Indicator.Code))
head(indata_cleaned)
## # A tibble: 6 × 53
## Country.Name X1960 X1961 X1962 X1963 X1964 X1965 X1966 X1967 X1968 X1969 X1970
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Aruba 4.82 4.66 4.47 4.27 4.06 3.84 3.62 3.42 3.23 3.05 2.91
## 2 Andorra NA NA NA NA NA NA NA NA NA NA NA
## 3 Afghanistan 7.67 7.67 7.67 7.67 7.67 7.67 7.67 7.67 7.67 7.67 7.67
## 4 Angola 7.32 7.35 7.38 7.41 7.42 7.43 7.42 7.40 7.38 7.34 7.30
## 5 Albania 6.19 6.08 5.96 5.83 5.71 5.59 5.48 5.38 5.27 5.16 5.05
## 6 United Arab… 6.93 6.91 6.89 6.88 6.86 6.84 6.82 6.78 6.74 6.68 6.60
## # ℹ 41 more variables: X1971 <dbl>, X1972 <dbl>, X1973 <dbl>, X1974 <dbl>,
## # X1975 <dbl>, X1976 <dbl>, X1977 <dbl>, X1978 <dbl>, X1979 <dbl>,
## # X1980 <dbl>, X1981 <dbl>, X1982 <dbl>, X1983 <dbl>, X1984 <dbl>,
## # X1985 <dbl>, X1986 <dbl>, X1987 <dbl>, X1988 <dbl>, X1989 <dbl>,
## # X1990 <dbl>, X1991 <dbl>, X1992 <dbl>, X1993 <dbl>, X1994 <dbl>,
## # X1995 <dbl>, X1996 <dbl>, X1997 <dbl>, X1998 <dbl>, X1999 <dbl>,
## # X2000 <dbl>, X2001 <dbl>, X2002 <dbl>, X2003 <dbl>, X2004 <dbl>, …
** Of the 56 variables, we drop the three identifier columns (Country.Code, Indicator.Name, Indicator.Code) that are not needed for the analysis, leaving 53. **
# Pivot to long dataset: one row per country and year column
indata_pivoted <- indata_cleaned %>%
  pivot_longer(
    cols = all_of(paste0("X", 1960:2011)),
    names_to = "Year",
    values_to = "Fertility.Rates"
  )
head(indata_pivoted)
## # A tibble: 6 × 3
## Country.Name Year Fertility.Rates
## <fct> <chr> <dbl>
## 1 Aruba X1960 4.82
## 2 Aruba X1961 4.66
## 3 Aruba X1962 4.47
## 4 Aruba X1963 4.27
## 5 Aruba X1964 4.06
## 6 Aruba X1965 3.84
** Reshapes the year columns (X1960–X2011) into rows with the ‘pivot_longer’ function, so that each row holds the fertility rate of one country in one year. **
# Format Year values: keep characters 2-5 (dropping the "X" prefix) and
# store the result as an integer
indata_pivoted <- indata_pivoted %>%
  mutate(Year = as.integer(str_sub(Year, start = 2, end = 5)))
# Check Missing Values: total number of NA fertility rates
indata_pivoted %>%
  pull(Fertility.Rates) %>%
  is.na() %>%
  sum()
## [1] 1104
** Strips the leading "X" from each year label (keeping characters 2–5) and converts the result to an integer. There are 1104 missing values in the data frame. **
# Check missing values by country
## Step 1: keep only the rows whose fertility rate is missing
## Step 2: tally those rows per country into a column named `count`
indata_pivoted %>%
  filter(is.na(Fertility.Rates)) %>%
  count(Country.Name, name = "count")
## # A tibble: 27 × 2
## Country.Name count
## <fct> <int>
## 1 American Samoa 52
## 2 Andorra 47
## 3 Bermuda 43
## 4 Cayman Islands 52
## 5 Curacao 47
## 6 Dominica 45
## 7 Faeroe Islands 52
## 8 Greenland 30
## 9 Isle of Man 49
## 10 Kosovo 21
## # ℹ 17 more rows
** 27 countries have missing values; each row of the result gives the number of missing yearly values for one country. **
# Fill missing values within countries: carry values downward first, then
# upward, without crossing country boundaries
indata_filled <- indata_pivoted %>%
  group_by(Country.Name) %>%
  fill(Fertility.Rates, .direction = "downup") %>%
  ungroup()
## Check: which countries still have missing values, and how many
indata_filled %>%
  filter(is.na(Fertility.Rates)) %>%
  count(Country.Name, name = "count")
## # A tibble: 9 × 2
## Country.Name count
## <fct> <int>
## 1 American Samoa 52
## 2 Cayman Islands 52
## 3 Faeroe Islands 52
## 4 Monaco 52
## 5 Northern Mariana Islands 52
## 6 San Marino 52
## 7 Sub-Saharan Africa (IFC classification) 52
## 8 Turks and Caicos Islands 52
## 9 Tuvalu 52
# Discard the rows that are still missing a fertility rate
indata_filled <- filter(indata_filled, !is.na(Fertility.Rates))
** After filling, 9 entries still have all 52 yearly values missing, i.e. they have no data at all and there is nothing to fill from. We remove these rows. **
# Basic scatter plot of fertility rate against year
ggplot(indata_filled, aes(x = Year, y = Fertility.Rates)) +
  geom_point()
# Plot Fertility Rates by Year
# Jittered, semi-transparent points with a smoothed trend line; both layers
# share the same x/y mapping.
ggplot(indata_filled, aes(x = Year, y = Fertility.Rates)) +
  geom_point(position = "jitter", alpha = 0.1) +
  geom_smooth() +
  labs(
    x = "Year",
    y = "Fertility Rates",
    title = "Fertility Rates over the years",
    subtitle = "Global fertility rates have decreased since 1960"
  )
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
** Global fertility rates decreased between 1960 and 2011. **
# Need a way to categorize 210 countries
## Option 1: subset the data
# Compare three countries as one line each. The subtitle names the plotted
# countries rather than claiming a global trend, because this chart shows
# only this subset.
indata_subset <- filter(
  indata_filled,
  Country.Name %in% c("United States", "Mexico", "Canada")
)
ggplot(indata_subset, aes(x = Year, y = Fertility.Rates, color = Country.Name)) +
  geom_line(alpha = 0.5) +
  labs(
    x = "Year",
    y = "Fertility Rates",
    title = "Fertility Rates over the years",
    subtitle = "United States, Mexico and Canada since 1960"
  )
** Fertility rates in the United States, Mexico, and Canada have decreased since 1960. **
## Option 2: Select based on statistics
# Top 10: mean fertility rate per country, sorted from highest to lowest,
# printing only the first ten rows
indata_filled %>%
  group_by(Country.Name) %>%
  summarise(avg = mean(Fertility.Rates)) %>%
  arrange(desc(avg)) %>%
  print(n = 10)
## # A tibble: 210 × 2
## Country.Name avg
## <fct> <dbl>
## 1 Niger 7.59
## 2 Afghanistan 7.47
## 3 Yemen, Rep. 7.43
## 4 Somalia 7.26
## 5 Rwanda 7.23
## 6 Burundi 7.19
## 7 Angola 7.06
## 8 Uganda 6.94
## 9 Mali 6.92
## 10 Chad 6.92
## # ℹ 200 more rows
** Average fertility rate for each country, sorted from highest to lowest; the ten highest are printed. **
# Bottom 10
# Bar chart of the ten countries with the LOWEST mean fertility rate.
# slice_min() selects the smallest `avg` values (ties are kept), so the
# chart matches the "Bottom 10" heading.
indata_filled %>%
  group_by(Country.Name) %>%
  summarise(avg = mean(Fertility.Rates)) %>%
  slice_min(avg, n = 10) %>%
  ggplot() +
  geom_bar(mapping = aes(x = Country.Name, y = avg), stat = "identity") +
  coord_flip()
** The bar chart shows the average fertility rate of each selected country, with country names on the vertical axis. **
### Option
library(countrycode)
# Work on a plain data frame copy and attach a continent factor derived
# from the country names
indata_df <- as.data.frame(indata_filled)
indata_df$Continent.Name <- factor(
  countrycode(
    sourcevar = indata_df$Country.Name,
    origin = "country.name",
    destination = "continent"
  )
)
## Warning: Some values were not matched unambiguously: Channel Islands, Kosovo, Latin America & Caribbean (all income levels), OECD members, Other small states, Pacific island small states
# Attach a region factor derived from the country names
indata_df$Region.Name <- factor(
  countrycode(
    sourcevar = indata_df$Country.Name,
    origin = "country.name",
    destination = "region"
  )
)
## Warning: Some values were not matched unambiguously: Latin America & Caribbean (all income levels), OECD members, Other small states, Pacific island small states
# Preview the first six rows with the new continent and region columns
head(indata_df, n = 6L)
## Country.Name Year Fertility.Rates Continent.Name Region.Name
## 1 Aruba 1960 4.820 Americas Latin America & Caribbean
## 2 Aruba 1961 4.655 Americas Latin America & Caribbean
## 3 Aruba 1962 4.471 Americas Latin America & Caribbean
## 4 Aruba 1963 4.271 Americas Latin America & Caribbean
## 5 Aruba 1964 4.059 Americas Latin America & Caribbean
## 6 Aruba 1965 3.842 Americas Latin America & Caribbean
# Convert back to a tibble; note this reuses (overwrites) the name `indata`
indata <- indata_df %>% as_tibble()
** Adds the continent and region of each country so that fertility rates can be analysed by continent or region instead of by individual country. **
# Jittered scatter plot of fertility rate by year, coloured by continent
ggplot(indata, aes(x = Year, y = Fertility.Rates, color = Continent.Name)) +
  geom_point(position = "jitter", alpha = 0.6) +
  labs(
    x = "Year",
    y = "Fertility Rates",
    title = "Fertility Rates over the years",
    subtitle = "Global fertility rates have decreased since 1960"
  )
** A scatter plot shows how global fertility rates have decreased, based on continent names. **
# Facet Plotting: one panel per region, points coloured by country
# (legend suppressed)
ggplot(indata, aes(x = Year, y = Fertility.Rates, color = Country.Name)) +
  geom_point(position = "jitter", alpha = 0.2, show.legend = FALSE) +
  facet_wrap(vars(Region.Name))
** The countries are grouped by region, and one panel is plotted per region. **
# Filtering and Plotting
# Dashed line per country for the rows whose region is
# "Middle East & North Africa" and whose continent is "Asia".
# `linewidth` is used for the line thickness because the `size` aesthetic
# for lines was deprecated in ggplot2 3.4.0.
indata %>%
  filter(Region.Name == "Middle East & North Africa" & Continent.Name == "Asia") %>%
  ggplot() +
  geom_line(
    mapping = aes(x = Year, y = Fertility.Rates, color = Country.Name),
    linewidth = 1,
    linetype = 2
  )
** By considering the continent and region, we identified 14 countries. The plot reveals a bell-shaped trend from 1960 to 2000, followed by a decrease. **
# Boxplot: distribution of fertility rates per continent, drawn horizontally
ggplot(indata, aes(x = Continent.Name, y = Fertility.Rates)) +
  geom_boxplot() +
  coord_flip()
** The African continent has the highest fertility rates, averaging around 7, while Asia exhibits a wide range of fertility rates, ranging from 2.6 to 6. **
# Histogram of all fertility rates in bins of width 0.5
ggplot(indata, aes(x = Fertility.Rates)) +
  geom_histogram(binwidth = 0.5)
** The histogram displays a right-skewed distribution: most observations are concentrated at lower fertility rates, with a longer tail extending to the right toward higher rates. **
# Barchart: mean fertility rate per region, bars ordered by that mean
indata %>%
  group_by(Region.Name) %>%
  summarise(avg = mean(Fertility.Rates)) %>%
  ggplot(aes(x = reorder(Region.Name, -avg), y = avg, fill = Region.Name)) +
  geom_col() +
  coord_flip()
** Bar chart of the average fertility rate for each region, with the bars ordered by that average. **